Required packages
#' Install any missing packages, then load them all.
#'
#' @param pkg Character vector of package names.
#' @return Named logical vector: TRUE where the package loaded successfully.
ipak <- function(pkg) {
  # Packages requested but not yet installed on this machine.
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg) > 0) {
    install.packages(new.pkg, dependencies = TRUE)
  }
  # require() (not library()) so a failed load yields FALSE instead of an error;
  # vapply() guarantees a named logical vector regardless of input length.
  vapply(pkg, require, logical(1), character.only = TRUE)
}
#repos='http://cran.muenster.r-project.org'
# Package groups used in this workshop.
stdata <- c("sp", "sf", "raster")  # spatial data handling
# NOTE: "party" was listed twice in the original vector; duplicate removed.
Stat_methods <- c("glmnet", "ranger", "gbm", "xgboost", "party", "caret", "gstat")
visual <- c("RColorBrewer", "ggplot2", "corrplot", "tmap", "leaflet", "mapview", "pdp", "vip", "DT", "sparkline")
map <- c("maptools")
tidy <- c("devtools", "dplyr", "tidyr", "reshape2", "knitr")
other <- c("countrycode", "htmlwidgets", "data.table", "Matrix", "GGally")
# Full install/load list, grouped by purpose.
packages <- c(stdata, tidy, Stat_methods, visual, map, other)
# Install any missing packages and attach them all (side effects: installs + loads).
ipak(packages)
## sp sf raster devtools dplyr
## TRUE TRUE TRUE TRUE TRUE
## tidyr reshape2 knitr glmnet ranger
## TRUE TRUE TRUE TRUE TRUE
## gbm xgboost party caret party
## TRUE TRUE TRUE TRUE TRUE
## gstat RColorBrewer ggplot2 corrplot tmap
## TRUE TRUE TRUE TRUE TRUE
## leaflet mapview pdp vip DT
## TRUE TRUE TRUE TRUE TRUE
## sparkline maptools countrycode htmlwidgets data.table
## TRUE TRUE TRUE TRUE TRUE
## Matrix GGally
## TRUE TRUE
Auxiliary package with preprocessed data in a dataframe.
# Install the workshop companion package from GitHub (install_github comes from devtools).
install_github("mengluchu/globalLUR/globalLUR/globalLUR")
library(globalLUR)
# List the exported objects so we know which helper functions are available.
ls("package:globalLUR")
## [1] "Brt_imp" "Brt_LUR" "cforest_LUR"
## [4] "countrywithppm" "create_ring" "ctree_LUR"
## [7] "error_matrix" "join_by_id" "Lasso"
## [10] "Lassoselected" "mechanical" "merge_roads"
## [13] "merged" "mergeraster2file" "plot_error"
## [16] "plot_rsq" "ppm2ug" "RDring_coef"
## [19] "removedips" "rf_imp" "rf_LUR"
## [22] "sampledf" "scatterplot" "subset_grep"
## [25] "univar_rsq" "xgboost_imp" "xgboost_LUR"
If the above is not successful for macOS users, the following command and a restart of R may be needed
# macOS fix: force an English UTF-8 locale for R (takes effect after restarting R).
system('defaults write org.R-project.R force.LANG en_US.UTF-8')
Get 3 color palettes.
# Three ColorBrewer palettes: a sequential green scheme and two diverging schemes.
colorB <- brewer.pal(n = 7, name = "Greens")
display.brewer.pal(n = 7, name = "Greens")
colorG <- brewer.pal(n = 11, name = "PiYG")
colorS <- brewer.pal(n = 11, name = "Spectral")
#set.seed(2)
Dataset:
night_value: annual mean \(NO_2\) (\(\mu g/m^3\)) at night time.
OMI_mean_filt: OMI column density, 2017 annual average.
# add data usethis::use_data()
# Load the bundled datasets from globalLUR into the workspace.
data("merged")
data("countrywithppm") # countries with ppm (parts per million)
# Inspect the available columns: responses, meteorology, population, road buffers.
names(merged)
## [1] "LATITUDE" "LONGITUDE" "value_mean"
## [4] "OMI_mean_filt" "elevation" "pop1k"
## [7] "RSp" "temperature_2m_1" "temperature_2m_10"
## [10] "temperature_2m_11" "temperature_2m_12" "temperature_2m_2"
## [13] "temperature_2m_3" "temperature_2m_4" "temperature_2m_5"
## [16] "temperature_2m_6" "temperature_2m_7" "temperature_2m_8"
## [19] "temperature_2m_9" "wind_speed_10m_1" "wind_speed_10m_10"
## [22] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_2"
## [25] "wind_speed_10m_3" "wind_speed_10m_4" "wind_speed_10m_5"
## [28] "wind_speed_10m_6" "wind_speed_10m_7" "wind_speed_10m_8"
## [31] "wind_speed_10m_9" "pop5k" "pop3k"
## [34] "country" "ROAD_1_25" "ROAD_1_50"
## [37] "ROAD_1_100" "ROAD_1_300" "ROAD_1_500"
## [40] "ROAD_1_800" "ROAD_1_1000" "ROAD_1_3000"
## [43] "ROAD_1_5000" "ROAD_2_25" "ROAD_2_50"
## [46] "ROAD_2_100" "ROAD_2_300" "ROAD_2_500"
## [49] "ROAD_2_800" "ROAD_2_1000" "ROAD_2_3000"
## [52] "ROAD_2_5000" "ROAD_3_25" "ROAD_3_50"
## [55] "ROAD_3_100" "ROAD_3_300" "ROAD_3_500"
## [58] "ROAD_3_800" "ROAD_3_1000" "ROAD_3_3000"
## [61] "ROAD_3_5000" "ROAD_4_25" "ROAD_4_50"
## [64] "ROAD_4_100" "ROAD_4_300" "ROAD_4_500"
## [67] "ROAD_4_800" "ROAD_4_1000" "ROAD_4_3000"
## [70] "ROAD_4_5000" "ROAD_5_25" "ROAD_5_50"
## [73] "ROAD_5_100" "ROAD_5_300" "ROAD_5_500"
## [76] "ROAD_5_800" "ROAD_5_1000" "ROAD_5_3000"
## [79] "ROAD_5_5000" "I_1_25" "I_1_50"
## [82] "I_1_100" "I_1_300" "I_1_500"
## [85] "I_1_800" "I_1_1000" "I_1_3000"
## [88] "I_1_5000" "Tropomi_2018" "day_value"
## [91] "night_value" "ID"
# Summary statistics for the three NO2 responses (annual mean, day, night).
summary(dplyr::select(merged, value_mean, day_value, night_value))
## value_mean day_value night_value
## Min. : 1.559 Min. : 1.883 Min. : 0.4307
## 1st Qu.:17.262 1st Qu.:18.353 1st Qu.: 15.4550
## Median :26.164 Median :28.433 Median : 22.3804
## Mean :27.905 Mean :30.912 Mean : 23.5377
## 3rd Qu.:36.883 3rd Qu.:41.900 3rd Qu.: 29.6922
## Max. :93.885 Max. :94.266 Max. :100.5855
## NA's :5 NA's :1
#datatable(merged, rownames = FALSE, filter = "top", options = list(pageLength = 5, scrollX = T))
Replace the missing-data placeholder value -1 with NA
# -1 is the missing-data placeholder in `merged`; convert it to NA.
# NOTE(review): na_if() is applied to the whole data frame here -- confirm the
# installed dplyr version supports data-frame input (newer dplyr is vector-only).
merged_1 = na_if(merged, -1)
Merge roads of different road types; here 3, 4, 5: the road lengths of these road types are aggregated. The original road-type columns are replaced (with keep = TRUE, they are retained).
# Aggregate road types 3, 4 and 5 into single ROAD_M345_* buffer variables.
# keep = FALSE drops the original per-type columns (keep = TRUE retains them);
# FALSE is spelled out because F is an ordinary, reassignable binding.
merged_mr = merge_roads(merged_1, c(3, 4, 5), keep = FALSE)
names(merged_mr)
## [1] "ROAD_M345_25" "ROAD_M345_50" "ROAD_M345_100"
## [4] "ROAD_M345_300" "ROAD_M345_500" "ROAD_M345_800"
## [7] "ROAD_M345_1000" "ROAD_M345_3000" "ROAD_M345_5000"
## [10] "LATITUDE" "LONGITUDE" "value_mean"
## [13] "OMI_mean_filt" "elevation" "pop1k"
## [16] "RSp" "temperature_2m_1" "temperature_2m_10"
## [19] "temperature_2m_11" "temperature_2m_12" "temperature_2m_2"
## [22] "temperature_2m_3" "temperature_2m_4" "temperature_2m_5"
## [25] "temperature_2m_6" "temperature_2m_7" "temperature_2m_8"
## [28] "temperature_2m_9" "wind_speed_10m_1" "wind_speed_10m_10"
## [31] "wind_speed_10m_11" "wind_speed_10m_12" "wind_speed_10m_2"
## [34] "wind_speed_10m_3" "wind_speed_10m_4" "wind_speed_10m_5"
## [37] "wind_speed_10m_6" "wind_speed_10m_7" "wind_speed_10m_8"
## [40] "wind_speed_10m_9" "pop5k" "pop3k"
## [43] "country" "ROAD_1_25" "ROAD_1_50"
## [46] "ROAD_1_100" "ROAD_1_300" "ROAD_1_500"
## [49] "ROAD_1_800" "ROAD_1_1000" "ROAD_1_3000"
## [52] "ROAD_1_5000" "ROAD_2_25" "ROAD_2_50"
## [55] "ROAD_2_100" "ROAD_2_300" "ROAD_2_500"
## [58] "ROAD_2_800" "ROAD_2_1000" "ROAD_2_3000"
## [61] "ROAD_2_5000" "I_1_25" "I_1_50"
## [64] "I_1_100" "I_1_300" "I_1_500"
## [67] "I_1_800" "I_1_1000" "I_1_3000"
## [70] "I_1_5000" "Tropomi_2018" "day_value"
## [73] "night_value" "ID"
#numeric country
#inde_var$country=as.numeric(inde_var$country)
Visualize with tmap: convenient
# Promote the merged table to an sf point layer. NOTE(review): no CRS is set;
# downstream maps assume lon/lat -- consider st_as_sf(..., crs = 4326).
locations_sf = st_as_sf(merged_mr, coords = c("LONGITUDE","LATITUDE"))
# Interactive dot map of the annual mean, with a popup of selected variables.
# NOTE(review): "value_mean" is passed both positionally and as col = -- the
# positional copy binds to a different tm_dots argument; verify it is intended.
osm_valuemean = tm_shape(locations_sf) +
tm_dots( "value_mean", col = "value_mean", size = 0.05,
popup.vars = c("value_mean", "day_value", "night_value", "ROAD_2_100", "ROAD_2_5000")) + tm_view(basemaps = c('OpenStreetMap'))
#+tm_shape(lnd)+tm_lines()
# Write the interactive map to a standalone HTML file.
tmap_save(osm_valuemean, "NO2mean.html")
Visualize with leaflet: more control. Show the day/night ratio; red: day/night > 1, blue: day/night < 1
# Colour each station by its day/night NO2 ratio: red when day > night, blue
# otherwise (stations with NA in either value get an NA colour).
merged_fp = merged_mr %>% mutate(ratiodn = day_value/night_value) %>% mutate(color = ifelse(ratiodn > 1, "red", "blue"))
# Circle radius is proportional to the annual mean; two home buttons zoom to
# example extents (Beijing and Utrecht).
m = leaflet(merged_fp) %>%
addTiles() %>% addCircleMarkers(radius = ~value_mean/5, color = ~color, popup = ~as.character(value_mean),fill = FALSE) %>% addProviderTiles(providers$Esri.NatGeoWorldMap) %>% addMouseCoordinates() %>%
addHomeButton(ext = extent(116.2, 117, 39.7, 40), layer.name = "Beijing") %>% addHomeButton(ext = extent(5, 5.2, 52, 52.2), layer.name = "Utrecht")
# Write the interactive map to a standalone HTML file.
saveWidget(m, file = "NO2daynight.html")
Boxplot
# Label each station "XX:Country Name" (ISO2 code plus full name).
countryname <- paste(merged_mr$country,
                     countrycode(merged_mr$country, 'iso2c', 'country.name'),
                     sep = ":")
# Tag countries that report in ppm with a trailing "*".
# `x %in% countrywithppm` is equivalent to the original
# `x %in% countrywithppm[countrywithppm %in% x]` -- intersecting the lookup set
# with the data never changes membership results.
countryname_s_e <- ifelse(merged_mr$country %in% countrywithppm,
                          paste0(countryname, "*"),
                          countryname)
merged_mr$countryfullname <- countryname_s_e
# Per-country median of the annual mean, used to colour the boxes;
# ungroup() so the grouping does not leak into later operations.
mergedmedian <- merged_mr %>%
  group_by(country) %>%
  mutate(median = median(value_mean, na.rm = TRUE)) %>%
  ungroup()
bp2 <- ggplot(mergedmedian, aes(x = countryfullname, y = value_mean, group = country)) +
  labs(x = "Country", y = expression(paste("NO"[2], " ", mu, "g/", "m"^3)), cex = 1.5) +
  geom_boxplot(aes(fill = median)) +
  theme(text = element_text(size = 13), axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_distiller(palette = "Spectral")
# scale_color_brewer(direction = 1)
# Cap the y axis at 100 ug/m3 so extreme outliers do not squash the boxes.
print(bp2 + ylim(0, 100))
Plot the paired correlation, for road predictors, population, Tropomi. For DE, CN, and world
# Pairwise correlations among responses, road buffers, population and Tropomi,
# plotted as pie-style corrplots: Germany, China, then the whole dataset.
plot_cor_pies <- function(df) {
  df %>%
    na.omit() %>%
    dplyr::select(matches("_value|ROAD|pop|Trop")) %>%
    cor() %>%
    corrplot(type = "upper", method = "pie", tl.cex = 0.7)
}
plot_cor_pies(merged_mr %>% filter(country == "DE"))
plot_cor_pies(merged_mr %>% filter(country == "CN"))
plot_cor_pies(merged_mr)
Spatial dependency
# Empirical semivariogram of the annual mean over all stations
# (coordinates are lon/lat degrees, so lag distances are in degrees).
grd_sp <- as_Spatial(locations_sf)
dt.vgm = variogram(value_mean~1, grd_sp)
plot(dt.vgm)
# Plot the empirical semivariogram of value_mean for a single country.
# COUN: two-letter country code; cutoff: maximum lag distance (degrees).
countryvariogram <- function(COUN, cutoff) {
  country_pts <- locations_sf %>% filter(country == COUN)
  country_sp <- as_Spatial(country_pts)
  vgm_emp <- variogram(value_mean ~ 1, country_sp, cutoff = cutoff)
  plot(vgm_emp)
}
countryvariogram("DE", 1)
countryvariogram("US", 1)
countryvariogram("CN", 1) # reason?
#Moran I test
#install.packages("ape", dependencies = TRUE)
#library(ape)
#merged_mrf = merged_mr%>%filter(country == "US")
#no2.dists <- as.matrix(dist(cbind(merged_mrf$LONGITUDE, merged_mrf$LATITUDE)))
#no2.dists[1:5, 1:5]
#no2.inv <- 1/no2.dists
#diag(no2.inv) <- 0
#no2.inv[1:5, 1:5]
#Moran.I(merged_mrf$value_mean, na.rm = T, no2.inv)
Separate the dataset into training and test datasets with a fraction (here 80% of the records are used for training, the rest for testing); “DE” is the two-digit code for Germany. For the world, the sampling uses the fraction per country.
#merged = merge(merged, stat[,-which(names(stat)%in%c("LATITUTE", "LONGITUDE"))], by = "ID", all.x = T)
# Split the German subset into 80% training / 20% test row indices, dropping
# identifier and coordinate columns from the predictor set.
response_predictor = globalLUR::sampledf(merged_mr, fraction = 0.8, country2digit = "DE", grepstring_rm = "ID|LATITUDE|LONGITUDE|countryfullname")
#Retrieve test, training, and all variables.
test = response_predictor$test
training = response_predictor$training
inde_var = response_predictor$inde_var
# country is an identifier, not a predictor -- drop it.
inde_var = inde_var %>% dplyr::select(-country)
The size of test and training dataset
# test/training hold row indices, so length() gives the two set sizes.
length(test)
## [1] 67
length(training)
## [1] 267
The paired correlation between dependent (mean, day, night) and independent variables. How much information does R-squared tell you?
# Check univariate R-squared: compute the r-sq of every predictor against the
# mean, day and night responses, then bind the columns into one data frame.
predictors_only <- inde_var %>% dplyr::select(-matches("value_mean|day_|night_"))
rsqmean <- univar_rsq(predictors_only, inde_var$value_mean)
rsqday <- univar_rsq(predictors_only, inde_var$day_value)
rsqnight <- univar_rsq(predictors_only, inde_var$night_value)
rsqdf <- cbind(rsqmean, rsqday, rsqnight, rownames(rsqmean))
names(rsqdf) <- c("mean", "day", "night", "vars")
plot_rsq(rsqdf = rsqdf, varname = "vars", xlab = "predictors", ylab = "R-squared")
# How does this compare to the variable importance estimated from LASSO, RF, SGB, XGB, etc.?
The scatter plots between predictors and responses. Loess: moving regression, non-parametric; each smoothed value is given by a weighted linear least-squares regression over the span.
# Scatter plots (with loess smoothers) of selected road/population/temperature
# predictors against the daytime values.
inde_var %>% dplyr::select(matches("ROAD_M345_3000|pop3k|ROAD_2_50$|temperature_2m_7|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "loess")
# Satellite/column predictors against the night-time values.
inde_var %>% dplyr::select(matches("Tro|OMI|Rsp|night_value")) %>% scatterplot(y_name = "night_value", fittingmethod = "loess")
# why?
# can choose any variable to look at the scatterplot
#inde_var %>% dplyr::select(matches("ROAD_1|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")
Discussion 1: try different scatterplots and discuss the findings
# Full list of candidate variables, for picking your own scatterplots.
inde_var %>% names
## [1] "ROAD_M345_25" "ROAD_M345_50" "ROAD_M345_100"
## [4] "ROAD_M345_300" "ROAD_M345_500" "ROAD_M345_800"
## [7] "ROAD_M345_1000" "ROAD_M345_3000" "ROAD_M345_5000"
## [10] "value_mean" "OMI_mean_filt" "elevation"
## [13] "pop1k" "RSp" "temperature_2m_1"
## [16] "temperature_2m_10" "temperature_2m_11" "temperature_2m_12"
## [19] "temperature_2m_2" "temperature_2m_3" "temperature_2m_4"
## [22] "temperature_2m_5" "temperature_2m_6" "temperature_2m_7"
## [25] "temperature_2m_8" "temperature_2m_9" "wind_speed_10m_1"
## [28] "wind_speed_10m_10" "wind_speed_10m_11" "wind_speed_10m_12"
## [31] "wind_speed_10m_2" "wind_speed_10m_3" "wind_speed_10m_4"
## [34] "wind_speed_10m_5" "wind_speed_10m_6" "wind_speed_10m_7"
## [37] "wind_speed_10m_8" "wind_speed_10m_9" "pop5k"
## [40] "pop3k" "ROAD_1_25" "ROAD_1_50"
## [43] "ROAD_1_100" "ROAD_1_300" "ROAD_1_500"
## [46] "ROAD_1_800" "ROAD_1_1000" "ROAD_1_3000"
## [49] "ROAD_1_5000" "ROAD_2_25" "ROAD_2_50"
## [52] "ROAD_2_100" "ROAD_2_300" "ROAD_2_500"
## [55] "ROAD_2_800" "ROAD_2_1000" "ROAD_2_3000"
## [58] "ROAD_2_5000" "I_1_25" "I_1_50"
## [61] "I_1_100" "I_1_300" "I_1_500"
## [64] "I_1_800" "I_1_1000" "I_1_3000"
## [67] "I_1_5000" "Tropomi_2018" "day_value"
## [70] "night_value"
# Same scatterplot helper but with a GAM smoother, for the 500 m primary-road buffer.
inde_var %>% dplyr::select(matches("ROAD_1_500$|day_value")) %>% scatterplot(y_name = "day_value", fittingmethod = "gam")
Extra: 5) Separate urban/rural hierarchical / two-step linear regression 6) mixed-effects regression
If simply using linear regression, the mean, day, night. Predictors are population, temperature, wind speed, GEOM product, OMI tropo column, elevation, and road buffers.
i.e. ROAD|population|value_mean|temperature|wind|GEOM product|OMI|elevation.
Note population is not always significant, though the individual R square for each buffer is high. The prediction for night is much better than for the day
# Training-set predictor matrix: responses plus road/pop/meteo/satellite columns.
# NOTE(review): the pattern says "Rsp" but the column is named "RSp" -- confirm
# subset_grep matches case-insensitively.
inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")
The tree and prediction error will be different when shuffling the training and testing data.
# Refit a conditional-inference tree on four different random train/test splits
# to demonstrate how unstable a single tree is.
for (i in 2:5)
{
set.seed(i)
testtree = globalLUR::sampledf(merged_mr,fraction = 0.8, "DE" )
# NOTE(review): inside a for loop the value returned by with()/ctree_LUR is
# discarded (no auto-printing); wrap in print() if the tree/metrics should show.
with (testtree, ctree_LUR(inde_var, y_varname= c("day_value"), training = training, test = test, grepstring ="ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi" ))
}
Creates diverse set of trees because 1) trees are unstable w.r.t. changes in learning/training data (bagging) 2) randomly preselect mtry splitting variables in each split
model training and parameter tuning
#caret
# All model identifiers that caret's train() accepts.
names(getModelInfo())
## [1] "ada" "AdaBag" "AdaBoost.M1"
## [4] "adaboost" "amdai" "ANFIS"
## [7] "avNNet" "awnb" "awtan"
## [10] "bag" "bagEarth" "bagEarthGCV"
## [13] "bagFDA" "bagFDAGCV" "bam"
## [16] "bartMachine" "bayesglm" "binda"
## [19] "blackboost" "blasso" "blassoAveraged"
## [22] "bridge" "brnn" "BstLm"
## [25] "bstSm" "bstTree" "C5.0"
## [28] "C5.0Cost" "C5.0Rules" "C5.0Tree"
## [31] "cforest" "chaid" "CSimca"
## [34] "ctree" "ctree2" "cubist"
## [37] "dda" "deepboost" "DENFIS"
## [40] "dnn" "dwdLinear" "dwdPoly"
## [43] "dwdRadial" "earth" "elm"
## [46] "enet" "evtree" "extraTrees"
## [49] "fda" "FH.GBML" "FIR.DM"
## [52] "foba" "FRBCS.CHI" "FRBCS.W"
## [55] "FS.HGD" "gam" "gamboost"
## [58] "gamLoess" "gamSpline" "gaussprLinear"
## [61] "gaussprPoly" "gaussprRadial" "gbm_h2o"
## [64] "gbm" "gcvEarth" "GFS.FR.MOGUL"
## [67] "GFS.LT.RS" "GFS.THRIFT" "glm.nb"
## [70] "glm" "glmboost" "glmnet_h2o"
## [73] "glmnet" "glmStepAIC" "gpls"
## [76] "hda" "hdda" "hdrda"
## [79] "HYFIS" "icr" "J48"
## [82] "JRip" "kernelpls" "kknn"
## [85] "knn" "krlsPoly" "krlsRadial"
## [88] "lars" "lars2" "lasso"
## [91] "lda" "lda2" "leapBackward"
## [94] "leapForward" "leapSeq" "Linda"
## [97] "lm" "lmStepAIC" "LMT"
## [100] "loclda" "logicBag" "LogitBoost"
## [103] "logreg" "lssvmLinear" "lssvmPoly"
## [106] "lssvmRadial" "lvq" "M5"
## [109] "M5Rules" "manb" "mda"
## [112] "Mlda" "mlp" "mlpKerasDecay"
## [115] "mlpKerasDecayCost" "mlpKerasDropout" "mlpKerasDropoutCost"
## [118] "mlpML" "mlpSGD" "mlpWeightDecay"
## [121] "mlpWeightDecayML" "monmlp" "msaenet"
## [124] "multinom" "mxnet" "mxnetAdam"
## [127] "naive_bayes" "nb" "nbDiscrete"
## [130] "nbSearch" "neuralnet" "nnet"
## [133] "nnls" "nodeHarvest" "null"
## [136] "OneR" "ordinalNet" "ordinalRF"
## [139] "ORFlog" "ORFpls" "ORFridge"
## [142] "ORFsvm" "ownn" "pam"
## [145] "parRF" "PART" "partDSA"
## [148] "pcaNNet" "pcr" "pda"
## [151] "pda2" "penalized" "PenalizedLDA"
## [154] "plr" "pls" "plsRglm"
## [157] "polr" "ppr" "PRIM"
## [160] "protoclass" "qda" "QdaCov"
## [163] "qrf" "qrnn" "randomGLM"
## [166] "ranger" "rbf" "rbfDDA"
## [169] "Rborist" "rda" "regLogistic"
## [172] "relaxo" "rf" "rFerns"
## [175] "RFlda" "rfRules" "ridge"
## [178] "rlda" "rlm" "rmda"
## [181] "rocc" "rotationForest" "rotationForestCp"
## [184] "rpart" "rpart1SE" "rpart2"
## [187] "rpartCost" "rpartScore" "rqlasso"
## [190] "rqnc" "RRF" "RRFglobal"
## [193] "rrlda" "RSimca" "rvmLinear"
## [196] "rvmPoly" "rvmRadial" "SBC"
## [199] "sda" "sdwd" "simpls"
## [202] "SLAVE" "slda" "smda"
## [205] "snn" "sparseLDA" "spikeslab"
## [208] "spls" "stepLDA" "stepQDA"
## [211] "superpc" "svmBoundrangeString" "svmExpoString"
## [214] "svmLinear" "svmLinear2" "svmLinear3"
## [217] "svmLinearWeights" "svmLinearWeights2" "svmPoly"
## [220] "svmRadial" "svmRadialCost" "svmRadialSigma"
## [223] "svmRadialWeights" "svmSpectrumString" "tan"
## [226] "tanSearch" "treebag" "vbmpRadial"
## [229] "vglmAdjCat" "vglmContRatio" "vglmCumulative"
## [232] "widekernelpls" "WM" "wsrf"
## [235] "xgbDART" "xgbLinear" "xgbTree"
## [238] "xyf"
# Same predictor matrix as above (recomputed so this chunk runs standalone).
inde_var_train = subset_grep(inde_var[training, ], "ROAD|pop|temp|wind|Rsp|OMI|eleva|coast|I_1|Tropomi|value_mean")
Training RF using Caret
Mtry
# Random forest via caret; by default train() tunes mtry over a small grid.
model_rf = train(value_mean ~ ., data = inde_var_train, method='rf') # mtry
plot(model_rf)
# Evaluate on the held-out rows.
fitted <- predict(model_rf, inde_var[test, ])
error_matrix(prediction = fitted, validation = inde_var[test, ]$value_mean)
Customize a tuning grid and control the resampling method
trl <- trainControl(method = "cv", number = 3) # control the resampling method
# Explicit tuning grid for ranger: mtry, split rule, and minimum node size.
tgrid <- expand.grid(
.mtry = 2:4,
.splitrule = "variance",
.min.node.size = c(10, 20)
)
# 3-fold CV over the 6 grid points; caret picks the lowest-RMSE combination.
model_rf = train(value_mean ~ ., data = inde_var_train, method='ranger', trControl = trl, tuneGrid= tgrid)
model_rf
## Random Forest
##
## 267 samples
## 66 predictor
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 179, 178, 177
## Resampling results across tuning parameters:
##
## mtry min.node.size RMSE Rsquared MAE
## 2 10 8.997801 0.5980968 7.047345
## 2 20 9.055365 0.6073703 7.106884
## 3 10 8.843950 0.6000990 6.833279
## 3 20 8.866068 0.6104239 6.860434
## 4 10 8.817259 0.5975289 6.751506
## 4 20 8.777117 0.6109004 6.740419
##
## Tuning parameter 'splitrule' was held constant at a value of variance
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 4, splitrule =
## variance and min.node.size = 20.
Try after the workshop as it takes quite a while
# Gradient boosting via caret (slow: tunes n.trees, depth and shrinkage).
model_gbm = train(value_mean ~ ., data = inde_var[training, ], method='gbm')
plot(model_gbm)
#gbm.step optimal number of trees.
Also computational intensive
#install.packages("caretEnsemble")
library(caretEnsemble)
# Stacking Algorithms - Run multiple algos in one call.
# classProbs is a classification-only option; value_mean is numeric (regression),
# so requesting class probabilities only triggers warnings -- it is left off.
# Variable renamed so it no longer shadows the trainControl() function.
stack_control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 2,
                              savePredictions = TRUE)
# NOTE(review): 'adaboost' is a classification model in caret -- confirm it
# accepts a numeric response before relying on this list.
algorithmList <- c('rf', 'adaboost', 'earth', 'xgbDART', 'svmRadial')
set.seed(100)
models <- caretList(value_mean ~ ., data = inde_var_train, trControl = stack_control, methodList = algorithmList)
# Compare resampling performance across the five learners.
results <- resamples(models)
summary(results)
set.seed(2)
# Metrics vip can compute for permutation importance.
vip::list_metrics()
## Metric Description Task
## 1 auc Area under (ROC) curve Binary classification
## 2 error Misclassification error Binary classification
## 3 logloss Log loss Binary classification
## 4 mauc Multiclass area under (ROC) curve Multiclass classification
## 5 mlogloss Multiclass log loss Multiclass classification
## 6 mse Mean squared error Regression
## 7 r2 R squared Regression
## 8 rsquared R squared Regression
## 9 rmse Root mean squared error Regression
# Predictors + response for a standalone ranger fit.
pre_mat = subset_grep(inde_var_train, grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
# Permutation importance; mtry = 33 is roughly half of the 65 predictors.
rf = ranger(value_mean~ ., data = pre_mat, mtry = 33, num.trees = 2000, importance = "permutation")
rf
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000, importance = "permutation")
##
## Type: Regression
## Number of trees: 2000
## Sample size: 267
## Number of independent variables: 65
## Mtry: 33
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 67.38482
## R squared (OOB): 0.6275708
# ranger method
# Permutation importance scores straight from the fitted ranger object.
importance(rf)
## ROAD_M345_25 ROAD_M345_50 ROAD_M345_100 ROAD_M345_300
## 2.9799495485 1.3795299058 3.3940733982 6.9000142949
## ROAD_M345_500 ROAD_M345_800 ROAD_M345_1000 ROAD_M345_3000
## 1.1003281775 1.3300478786 1.6184288926 32.8578750899
## ROAD_M345_5000 elevation pop1k temperature_2m_1
## 4.5041510764 0.7564435364 5.9485215629 0.0458152339
## temperature_2m_10 temperature_2m_11 temperature_2m_12 temperature_2m_2
## 0.1406238476 0.1489907867 0.2314376158 0.0599072699
## temperature_2m_3 temperature_2m_4 temperature_2m_5 temperature_2m_6
## 0.3615865478 0.0847134856 0.1328419768 0.0786573230
## temperature_2m_7 temperature_2m_8 temperature_2m_9 wind_speed_10m_1
## 0.0324254214 -0.0914887448 0.2954260567 -0.2989050079
## wind_speed_10m_10 wind_speed_10m_11 wind_speed_10m_12 wind_speed_10m_2
## 0.5854623248 0.0271890399 -0.1639054988 0.5555364295
## wind_speed_10m_3 wind_speed_10m_4 wind_speed_10m_5 wind_speed_10m_6
## 0.8616855051 -0.0228743631 0.0364021986 0.1305959695
## wind_speed_10m_7 wind_speed_10m_8 wind_speed_10m_9 pop5k
## 0.5964910347 0.8962120589 0.2005994900 4.7443086592
## pop3k ROAD_1_25 ROAD_1_50 ROAD_1_100
## 13.3043280125 0.2798327004 0.2741238798 0.3495866779
## ROAD_1_300 ROAD_1_500 ROAD_1_800 ROAD_1_1000
## 0.1822082537 0.3844218291 0.5830288337 0.1316841708
## ROAD_1_3000 ROAD_1_5000 ROAD_2_25 ROAD_2_50
## 1.6731827011 3.2101014723 6.3084451401 34.9422688357
## ROAD_2_100 ROAD_2_300 ROAD_2_500 ROAD_2_800
## 9.1977217496 2.2099114323 0.6560607977 0.0974125444
## ROAD_2_1000 ROAD_2_3000 ROAD_2_5000 I_1_25
## 0.0164787415 0.6608285985 0.5008840752 0.0023079289
## I_1_50 I_1_100 I_1_300 I_1_500
## 0.0034523932 0.0003712221 -0.0020227523 0.0964720825
## I_1_800 I_1_1000 I_1_3000 I_1_5000
## -0.0237811790 -0.1536266182 0.8585707656 1.6418053340
## Tropomi_2018
## 4.7613640160
#vip
# Permutation importance via vip, scored once by R-squared and once by RMSE.
DF_P_r2 = vi(rf, method = "permute", target = "value_mean", metric = "r2" ) # very clear what method is used and what is the target
DF_P_rmse = vi(rf, method = "permute", target = "value_mean", metric = "rmse")
vip (DF_P_rmse)
vip (DF_P_r2)
# Correlations between top-ranked predictors: the second call (output below)
# shows ROAD_M345_3000 is strongly correlated with pop3k (~0.83).
with(inde_var, cor(ROAD_M345_3000, ROAD_2_50))
## [1] 0.3211617
with(inde_var, cor(ROAD_M345_3000, pop3k))
## [1] 0.8348217
partial dependence plots: all the variables. (using sparklines, takes a while)
# Attach a sparkline (mini partial-dependence curve) to every importance row.
a=add_sparklines(DF_P_r2, fit = rf)
# Write the interactive table to a standalone HTML file.
saveWidget(a, file="sparkline.html")
Partial dependence plot of selected
# Three hand-picked predictors for the partial-dependence demonstrations.
# dplyr:: prefix added for consistency with the rest of the file -- raster is
# attached and masks the bare select().
pre_mat_s = inde_var_train %>% dplyr::select(value_mean, ROAD_2_50, pop3k, ROAD_M345_300)
# Linear benchmark and a small random forest on the same three predictors.
lm_s = lm(value_mean~., data = pre_mat_s)
rf_s = ranger(value_mean~ ., data = pre_mat_s, num.trees = 2000, importance = "permutation")
rf_s
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat_s, num.trees = 2000, importance = "permutation")
##
## Type: Regression
## Number of trees: 2000
## Sample size: 267
## Number of independent variables: 3
## Mtry: 1
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 74.92001
## R squared (OOB): 0.5859245
correlation
# Pairwise scatter/correlation matrix of the three predictors (response removed).
# dplyr:: prefix added for consistency with the rest of the file -- raster is
# attached and masks the bare select().
pre_mat_predictor = pre_mat_s %>% dplyr::select(-value_mean)
ggpairs(pre_mat_predictor)
# Partial dependence on ROAD_M345_300: linear model vs random forest.
p_lm = partial(lm_s, "ROAD_M345_300",plot = TRUE, rug = TRUE)
plot(p_lm)
p2 = partial(rf_s, "ROAD_M345_300",plot = TRUE, rug = TRUE)
plot(p2)
#slow
# Two-variable partial dependence surface (computationally heavy).
pd <- partial(rf_s, pred.var = c("pop3k", "ROAD_M345_300" ))
# Default PDP
pd1 = plotPartial(pd)
# Add contour lines and use a different color palette
rwb <- colorRampPalette(c("red", "white", "blue"))
pdp2 = plotPartial(pd, contour = TRUE, col.regions = rwb)
#3-D surface
#pdp3 <- plotPartial(pd, levelplot = F, zlab = "ROAD_1_50", colorkey = T,
# screen = list(z = -20, x = -60) )
p3 = partial(rf_s, "ROAD_2_50", plot = TRUE, rug = TRUE)
p1 = partial(rf_s, "pop3k", plot = TRUE, rug = TRUE)
# NOTE(review): grid.arrange() comes from gridExtra, which is never attached in
# the package list above -- add "gridExtra" to ipak() or library() it first.
grid.arrange(p1, p2, p3, pd1, pdp2, ncol = 2)
# Recompute the predictor matrix so this chunk runs standalone.
pre_mat = subset_grep(inde_var_train, grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
# Gradient boosting: 2000 gaussian trees, depth 6, slow learning rate, 50% bagging.
gbm1 = gbm(formula = value_mean~., data = pre_mat, distribution = "gaussian", n.trees = 2000,
interaction.depth = 6, shrinkage = 0.01, bag.fraction = 0.5 )
names(pre_mat)
## [1] "ROAD_M345_25" "ROAD_M345_50" "ROAD_M345_100"
## [4] "ROAD_M345_300" "ROAD_M345_500" "ROAD_M345_800"
## [7] "ROAD_M345_1000" "ROAD_M345_3000" "ROAD_M345_5000"
## [10] "value_mean" "elevation" "pop1k"
## [13] "temperature_2m_1" "temperature_2m_10" "temperature_2m_11"
## [16] "temperature_2m_12" "temperature_2m_2" "temperature_2m_3"
## [19] "temperature_2m_4" "temperature_2m_5" "temperature_2m_6"
## [22] "temperature_2m_7" "temperature_2m_8" "temperature_2m_9"
## [25] "wind_speed_10m_1" "wind_speed_10m_10" "wind_speed_10m_11"
## [28] "wind_speed_10m_12" "wind_speed_10m_2" "wind_speed_10m_3"
## [31] "wind_speed_10m_4" "wind_speed_10m_5" "wind_speed_10m_6"
## [34] "wind_speed_10m_7" "wind_speed_10m_8" "wind_speed_10m_9"
## [37] "pop5k" "pop3k" "ROAD_1_25"
## [40] "ROAD_1_50" "ROAD_1_100" "ROAD_1_300"
## [43] "ROAD_1_500" "ROAD_1_800" "ROAD_1_1000"
## [46] "ROAD_1_3000" "ROAD_1_5000" "ROAD_2_25"
## [49] "ROAD_2_50" "ROAD_2_100" "ROAD_2_300"
## [52] "ROAD_2_500" "ROAD_2_800" "ROAD_2_1000"
## [55] "ROAD_2_3000" "ROAD_2_5000" "I_1_25"
## [58] "I_1_50" "I_1_100" "I_1_300"
## [61] "I_1_500" "I_1_800" "I_1_1000"
## [64] "I_1_3000" "I_1_5000" "Tropomi_2018"
# Relative influence of each predictor (also draws the influence barplot).
summary(gbm1)
## var rel.inf
## ROAD_M345_3000 ROAD_M345_3000 16.04407753
## ROAD_2_50 ROAD_2_50 13.22190588
## ROAD_1_5000 ROAD_1_5000 4.87459613
## pop1k pop1k 4.39697062
## pop3k pop3k 4.33211390
## ROAD_M345_300 ROAD_M345_300 3.51850569
## Tropomi_2018 Tropomi_2018 3.50624238
## pop5k pop5k 3.48337075
## ROAD_M345_100 ROAD_M345_100 3.38857894
## ROAD_M345_5000 ROAD_M345_5000 3.32140774
## ROAD_M345_25 ROAD_M345_25 2.20961520
## ROAD_M345_1000 ROAD_M345_1000 2.02533032
## ROAD_1_3000 ROAD_1_3000 1.97747762
## I_1_5000 I_1_5000 1.70858529
## I_1_3000 I_1_3000 1.54883565
## ROAD_M345_800 ROAD_M345_800 1.52194262
## ROAD_M345_500 ROAD_M345_500 1.50755398
## ROAD_2_5000 ROAD_2_5000 1.45724423
## ROAD_M345_50 ROAD_M345_50 1.44260595
## ROAD_2_3000 ROAD_2_3000 1.31084123
## wind_speed_10m_7 wind_speed_10m_7 1.30361673
## ROAD_2_100 ROAD_2_100 1.22070954
## ROAD_2_25 ROAD_2_25 1.21112277
## I_1_1000 I_1_1000 1.13741256
## ROAD_1_300 ROAD_1_300 1.13401983
## elevation elevation 1.08042895
## ROAD_2_800 ROAD_2_800 1.06635450
## ROAD_2_1000 ROAD_2_1000 1.01911125
## ROAD_2_300 ROAD_2_300 0.98481817
## wind_speed_10m_3 wind_speed_10m_3 0.83578807
## ROAD_1_800 ROAD_1_800 0.80360507
## I_1_800 I_1_800 0.74566692
## temperature_2m_8 temperature_2m_8 0.71759123
## ROAD_2_500 ROAD_2_500 0.69915094
## ROAD_1_500 ROAD_1_500 0.59740058
## ROAD_1_1000 ROAD_1_1000 0.57653849
## temperature_2m_4 temperature_2m_4 0.55618011
## wind_speed_10m_11 wind_speed_10m_11 0.54419914
## temperature_2m_9 temperature_2m_9 0.50433042
## temperature_2m_6 temperature_2m_6 0.49637947
## temperature_2m_5 temperature_2m_5 0.49371090
## wind_speed_10m_4 wind_speed_10m_4 0.49047107
## wind_speed_10m_1 wind_speed_10m_1 0.47565396
## temperature_2m_2 temperature_2m_2 0.46449627
## temperature_2m_3 temperature_2m_3 0.44565285
## wind_speed_10m_9 wind_speed_10m_9 0.40161661
## wind_speed_10m_8 wind_speed_10m_8 0.37871466
## I_1_500 I_1_500 0.36730367
## wind_speed_10m_10 wind_speed_10m_10 0.32734098
## temperature_2m_7 temperature_2m_7 0.30061188
## temperature_2m_12 temperature_2m_12 0.29655925
## I_1_300 I_1_300 0.24465515
## temperature_2m_1 temperature_2m_1 0.23641507
## wind_speed_10m_12 wind_speed_10m_12 0.23300537
## wind_speed_10m_5 wind_speed_10m_5 0.18083836
## wind_speed_10m_6 wind_speed_10m_6 0.17585163
## wind_speed_10m_2 wind_speed_10m_2 0.17027715
## temperature_2m_10 temperature_2m_10 0.16116812
## temperature_2m_11 temperature_2m_11 0.10583026
## I_1_100 I_1_100 0.01760041
## ROAD_1_25 ROAD_1_25 0.00000000
## ROAD_1_50 ROAD_1_50 0.00000000
## ROAD_1_100 ROAD_1_100 0.00000000
## I_1_25 I_1_25 0.00000000
## I_1_50 I_1_50 0.00000000
# Joint partial dependence of the 2nd and 3rd predictors in the gbm fit.
plot(gbm1, i.var = 2:3)
#plot(gbm1, i.var = 1)
#rf_residual <- pre_rf - rdf_test$NO2
Xgboost
Tuning XGBoost is more complex (as it has a lot more hyperparameters to tune): https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
gamma[default=0][range: (0,Inf)] It controls regularization (or prevents overfitting). The optimal value of gamma depends on the data set and other parameter values. Higher the value, higher the regularization. Regularization means penalizing large coefficients which don’t improve the model’s performance. default = 0 means no regularization. Tune trick: Start with 0 and check CV error rate. If you see train error >>> test error, bring gamma into action.
lambda and Alpha
nrounds[default=100] It controls the maximum number of iterations. For classification, it is similar to the number of trees to grow. Should be tuned using CV
eta[default=0.3][range: (0,1)] It controls the learning rate, i.e., the rate at which our model learns patterns in data. After every round, it shrinks the feature weights to reach the best optimum. Lower eta leads to slower computation. It must be supported by increase in nrounds. Typically, it lies between 0.01 - 0.3
max_depth[default=6][range: (0,Inf)] It controls the depth of the tree. Larger data sets require deep trees to learn the rules from data. Should be tuned using CV
# Build xgboost design matrices for the day_value response.
y_varname <- "day_value"
varstring <- "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
prenres <- paste(y_varname, "|", varstring, sep = "")
sub_mat <- subset_grep(inde_var, prenres)
# Row-index the same train/test split used above.
x_train <- sub_mat[training, ]
y_train <- sub_mat[training, y_varname]
x_test <- sub_mat[test, ]
y_test <- sub_mat[test, y_varname]
# keep.rownames = FALSE spelled out (F is a reassignable binding).
df_train <- data.table(x_train, keep.rownames = FALSE)
df_test <- data.table(x_test, keep.rownames = FALSE)
# Sparse design matrices; [, -1] drops the intercept column.
formu <- as.formula(paste(y_varname, "~.", sep = ""))
dfmatrix_train <- sparse.model.matrix(formu, data = df_train)[, -1]
dfmatrix_test <- sparse.model.matrix(formu, data = df_test)[, -1]
train_b <- xgb.DMatrix(data = dfmatrix_train, label = y_train)
test_b <- xgb.DMatrix(data = dfmatrix_test, label = y_test)
# Hyper-parameters for xgb.train. xgboost parameter names are lowercase and
# unknown names pass through silently, so the original "Gamma" never reached
# the booster (the str(xgbcv) output below shows it stored verbatim in params).
# nrounds is an xgb.train() argument, not a params entry, so it is not listed here.
params <- list(booster = "gbtree",
               max_depth = 4,
               eta = 0.05,
               nthread = 2,
               gamma = 2)
#xgb_t = xgb.train (params = params, data = train_b, nrounds = 500, watchlist = list(val=test_b, train = train_b), print_every_n = 10, early_stopping_rounds = 50, maximize = F , eval_metric = "rmse")
#outputvec = inde_var[training, y_varname]
max_depth = 4
eta = 0.01
nthread = 4
nrounds = 1000
Gamma = 2
# Simplest approach: tuning of the number of rounds via 10-fold CV with
# early stopping.
# Fixes: pass the lowercase parameter "gamma" (xgboost ignores the unknown
# name "Gamma"); pass the intended max_depth (otherwise xgb.cv silently
# falls back to the default depth of 6 while the final model uses 4);
# spell out TRUE/FALSE instead of T/F.
xgbcv <- xgb.cv(data = train_b, nfold = 10, nround = nrounds,
                max_depth = max_depth, eta = eta, nthread = nthread,
                gamma = Gamma, showsd = TRUE, stratified = TRUE,
                print_every_n = 200, early_stopping_rounds = 50,
                maximize = FALSE)
## [1] train-rmse:29.996885+0.438709 test-rmse:29.628804+4.523377
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 50 rounds.
##
## [201] train-rmse:5.801698+0.088228 test-rmse:11.572709+2.704056
## [401] train-rmse:1.553054+0.046746 test-rmse:10.603132+2.239456
## [601] train-rmse:0.676210+0.032505 test-rmse:10.535816+2.178796
## Stopping. Best iteration:
## [612] train-rmse:0.652654+0.031204 test-rmse:10.534662+2.178269
# Auto-print the CV result: per-iteration train/test RMSE and best iteration.
xgbcv
## ##### xgb.cv 10-folds
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 1 29.9968846 0.43870878 29.62880 4.523377
## 2 29.7299276 0.43481263 29.37897 4.486734
## 3 29.4656318 0.43085778 29.13682 4.450636
## 4 29.2041945 0.42713978 28.88826 4.420785
## 5 28.9453251 0.42339977 28.64471 4.393379
## ---
## 658 0.5671022 0.02650556 10.53705 2.175976
## 659 0.5653967 0.02636010 10.53711 2.175952
## 660 0.5637832 0.02631878 10.53724 2.175795
## 661 0.5622427 0.02656286 10.53721 2.175976
## 662 0.5606334 0.02640172 10.53733 2.175752
## Best iteration:
## iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
## 612 0.6526541 0.03120427 10.53466 2.178269
# Inspect the full structure of the CV object (params actually used,
# evaluation log, fold indices, best iteration).
str(xgbcv)
## List of 9
## $ call : language xgb.cv(data = train_b, nrounds = nrounds, nfold = 10, showsd = T, stratified = T, print_every_n = 200, early| __truncated__ ...
## $ params :List of 4
## ..$ eta : num 0.01
## ..$ nthread: num 4
## ..$ Gamma : num 2
## ..$ silent : num 1
## $ callbacks :List of 3
## ..$ cb.print.evaluation:function (env = parent.frame())
## .. ..- attr(*, "call")= language cb.print.evaluation(period = print_every_n, showsd = showsd)
## .. ..- attr(*, "name")= chr "cb.print.evaluation"
## ..$ cb.evaluation.log :function (env = parent.frame(), finalize = FALSE)
## .. ..- attr(*, "call")= language cb.evaluation.log()
## .. ..- attr(*, "name")= chr "cb.evaluation.log"
## ..$ cb.early.stop :function (env = parent.frame(), finalize = FALSE)
## .. ..- attr(*, "call")= language cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, verbose = verbose)
## .. ..- attr(*, "name")= chr "cb.early.stop"
## $ evaluation_log :Classes 'data.table' and 'data.frame': 662 obs. of 5 variables:
## ..$ iter : num [1:662] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ train_rmse_mean: num [1:662] 30 29.7 29.5 29.2 28.9 ...
## ..$ train_rmse_std : num [1:662] 0.439 0.435 0.431 0.427 0.423 ...
## ..$ test_rmse_mean : num [1:662] 29.6 29.4 29.1 28.9 28.6 ...
## ..$ test_rmse_std : num [1:662] 4.52 4.49 4.45 4.42 4.39 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
## $ niter : int 662
## $ nfeatures : int 67
## $ folds :List of 10
## ..$ : int [1:26] 99 49 148 193 65 257 16 10 27 169 ...
## ..$ : int [1:26] 95 259 46 137 167 183 110 207 7 155 ...
## ..$ : int [1:26] 103 54 147 214 15 154 29 125 129 243 ...
## ..$ : int [1:26] 108 22 237 98 198 251 165 72 12 78 ...
## ..$ : int [1:26] 190 63 11 256 140 17 56 94 187 168 ...
## ..$ : int [1:26] 206 132 241 133 246 64 231 77 116 156 ...
## ..$ : int [1:26] 176 218 123 255 113 220 26 199 90 138 ...
## ..$ : int [1:26] 192 83 224 182 75 44 180 85 55 57 ...
## ..$ : int [1:26] 233 178 24 53 185 96 235 82 158 191 ...
## ..$ : int [1:33] 86 238 81 170 248 157 260 144 88 102 ...
## $ best_iteration : int 612
## $ best_ntreelimit: num 612
## - attr(*, "class")= chr "xgb.cv.synchronous"
# Train the final model with the CV-selected number of rounds.
# Fix: lowercase "gamma" (xgboost ignores the unknown parameter "Gamma");
# TRUE/FALSE instead of F.
bst <- xgboost(data = train_b, max_depth = max_depth, eta = eta,
               nthread = nthread, nrounds = xgbcv$best_iteration,
               gamma = Gamma, verbose = 1, print_every_n = 200,
               early_stopping_rounds = 50, maximize = FALSE)
## [1] train-rmse:30.002407
## Will train until train_rmse hasn't improved in 50 rounds.
##
## [201] train-rmse:6.912491
## [401] train-rmse:3.630333
## [601] train-rmse:2.623341
## [612] train-rmse:2.593215
# Predict on the held-out set and report test-set accuracy (RMSE, MAE, IQR).
xgbpre <- predict(bst, test_b)
error_matrix(y_test, xgbpre)
## RMSE MAE IQR
## 9.322921 6.824302 8.600402
# globalLUR wrapper: fits an xgboost land-use-regression model on the
# covariates matched by the pattern and prints variable importance plus
# test-set error metrics.
varstring <- "ROAD|pop|temp|wind|RSp|OMI|eleva|coast|I_1|Tropo"
xgboost_LUR(inde_var,
            y_varname = c("day_value"),
            max_depth = 4, eta = 0.02, nthread = 2, nrounds = 2000,
            training = training, test = test,
            grepstring = varstring)
## Feature Gain Cover Frequency
## 1: ROAD_M345_3000 0.31047971 0.0366361907 0.035498144
## 2: ROAD_2_50 0.24439537 0.0244392119 0.014189086
## 3: ROAD_M345_25 0.06520737 0.0306984484 0.108020139
## 4: pop1k 0.03796357 0.0176741643 0.017087932
## 5: ROAD_M345_300 0.03727558 0.0586076287 0.060163759
## 6: ROAD_1_5000 0.03262909 0.0338638930 0.030107308
## 7: pop3k 0.02494485 0.0083546052 0.007984539
## 8: ROAD_M345_5000 0.02405547 0.0471630027 0.042974114
## 9: Tropomi_2018 0.01487466 0.0258529376 0.016172507
## 10: I_1_5000 0.01282862 0.0156990023 0.021003916
## 11: ROAD_1_25 0.01281015 0.0026421916 0.001983421
## 12: pop5k 0.01230177 0.0317986938 0.022224483
## 13: ROAD_1_3000 0.01212728 0.0122667459 0.016935361
## 14: ROAD_M345_100 0.01169337 0.0269560114 0.045516961
## 15: wind_speed_10m_10 0.01041657 0.0007919504 0.001830850
## RMSE MAE IQR
## 9.041948 6.634386 7.987956
#xgboost_imp (variabledf = inde_var, y_varname = "day_value", max_depth = 5, eta = 0.02, nthread = 4, nrounds = 2000, training = training, test = test, grepstring = varstring )
Spatial correlation of the errors of the random forest model.
# Draw an 80/20 train/test split for the Chinese stations, fit a random
# forest on the matched covariates, and inspect OOB performance.
set.seed(2)
pr <- globalLUR::sampledf(merged_mr, fraction = 0.8, country2digit = "CN",
                          grepstring_rm = "ID|countryfullname")
inde_var_train <- with(pr, inde_var[training, ])
inde_var_test  <- with(pr, inde_var[test, ])
# Keep the response (value_mean) plus the covariates of interest.
pre_mat <- subset_grep(inde_var_train,
                       grepstring = "ROAD|pop|value_mean|temp|wind|eleva|coast|I_1|Trop")
rf <- ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000,
             importance = "permutation")
# Print the fit summary (OOB MSE and R-squared).
rf
## Ranger result
##
## Call:
## ranger(value_mean ~ ., data = pre_mat, mtry = 33, num.trees = 2000, importance = "permutation")
##
## Type: Regression
## Number of trees: 2000
## Sample size: 1009
## Number of independent variables: 65
## Mtry: 33
## Target node size: 5
## Variable importance mode: permutation
## Splitrule: variance
## OOB prediction error (MSE): 33.45173
## R squared (OOB): 0.7693244
# Compute test-set prediction errors with their coordinates, convert to a
# Spatial object, and plot the empirical variogram of the errors to check
# for residual spatial correlation.
pred_error <- predictions(predict(rf, inde_var_test)) - inde_var_test$value_mean
errordf <- data.frame(error = pred_error,
                      LONGITUDE = inde_var_test$LONGITUDE,
                      LATITUDE = inde_var_test$LATITUDE)
error_sf <- st_as_sf(errordf, coords = c("LONGITUDE", "LATITUDE"))
error_sp <- as_Spatial(error_sf)
# cutoff = 1: only consider point pairs up to 1 unit (degree) apart.
dt.vgm <- variogram(error ~ 1, error_sp, cutoff = 1)
plot(dt.vgm)
# Lasso regression (globalLUR wrapper) for the daytime response; prints the
# selected lambda diagnostics and test-set error metrics.
Lasso(inde_var, vis1 = T, y_varname = "day_value", training = training, test=test)
## [1] 0.4473723
## [1] 1.982137
## RMSE MAE IQR
## 8.261252 6.578987 10.156102
# Same Lasso fit for the nighttime response, for comparison with day_value.
Lasso(inde_var, vis1 = T, y_varname = "night_value", training = training, test=test)
## [1] 0.1849945
## [1] 1.3051
## RMSE MAE IQR
## 4.538854 3.799454 6.550015